Import Libraries
In [1]:
import numpy as np
import pandas as pd
import zipfile
import os
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.copy_on_write = True
In [35]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
pio.templates.default = 'plotly_dark'
In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, auc, roc_auc_score, confusion_matrix
In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
In [5]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
In [37]:
import optuna
import logging
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_contour
optuna.logging.set_verbosity(optuna.logging.WARNING)
Read Data
In [7]:
def unzip_file_to_same_location(zip_path):
extract_to = os.path.dirname(zip_path)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
print('Extraction Complete')
In [8]:
zip_file_path = 'archive (2).zip'
unzip_file_to_same_location(zip_file_path)
Extraction Complete
In [9]:
df = pd.read_csv('exercise_angles.csv')
In [10]:
num_observations, num_features = df.shape
print(f"Number of observations: {num_observations}")
print(f"Number of features: {num_features}")
Number of observations: 31033
Number of features: 12
In [11]:
side_counts = df['Side'].value_counts()
print("Value counts for 'Side' column:")
print(side_counts)
Value counts for 'Side' column:
Side
left    31033
Name: count, dtype: int64
In [12]:
df.drop('Side',axis=1,inplace=True)
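Since 'Side' turned out to be constant, it is worth a quick check that no other column is single-valued before modeling (a minimal sketch):

print(df.nunique())  # a column with only one unique value carries no signal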
In [13]:
df.describe()
Out[13]:
| | Shoulder_Angle | Elbow_Angle | Hip_Angle | Knee_Angle | Ankle_Angle | Shoulder_Ground_Angle | Elbow_Ground_Angle | Hip_Ground_Angle | Knee_Ground_Angle | Ankle_Ground_Angle |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 | 31033.000000 |
| mean | 66.522206 | 114.303010 | 137.466151 | 143.273623 | 135.211957 | 88.816743 | 88.926949 | 79.408694 | 75.795121 | 68.985596 |
| std | 60.226756 | 57.906279 | 57.048278 | 48.041715 | 53.304068 | 14.546233 | 13.856550 | 42.359381 | 48.530150 | 57.802208 |
| min | 0.002748 | 0.000974 | 0.006850 | 0.116036 | 0.031297 | -90.000000 | -90.000000 | -90.000000 | -90.000000 | -90.000000 |
| 25% | 17.852184 | 58.900491 | 111.556724 | 123.646144 | 106.740814 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 |
| 50% | 40.585632 | 132.999090 | 168.374922 | 168.227063 | 162.926184 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 |
| 75% | 121.209005 | 168.769517 | 175.656498 | 177.225089 | 175.735039 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 |
| max | 179.991577 | 179.998861 | 179.999848 | 179.999277 | 179.999942 | 90.000000 | 90.000000 | 90.000000 | 90.000000 | 90.000000 |
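The summary shows the five Ground_Angle columns saturating at ±90 (the 25th percentile is already 90), while the joint angles spread across 0–180; this asymmetry motivates the split scaling strategy used below. A quick check of how often the ground angles sit exactly at the +90 boundary (a hedged sketch; the name `ground_cols` and the suffix match are assumptions based on the column names above):

ground_cols = [c for c in df.columns if c.endswith('_Ground_Angle')]  # assumed naming pattern from the table above
print((df[ground_cols] == 90).mean())  # per-column fraction of rows pinned at +90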
In [15]:
numeric_columns = df.select_dtypes(include=['float64']).columns
colors = ['#2d288f', '#5342a5', '#735ebb', '#927bd1', '#b199e8',
'#f1cbf9', '#dca7f7', '#bf85f8', '#9666fa', '#554cff']
fig = make_subplots(rows=2, cols=5, subplot_titles=[col.replace('_', ' ') for col in numeric_columns])
for i, col in enumerate(numeric_columns):
row = (i // 5) + 1
col_pos = (i % 5) + 1
fig.add_trace(
        go.Histogram(x=df[col], marker_color=colors[i], name=col.replace('_', ' '), nbinsx=20, showlegend=False),
row=row, col=col_pos
)
fig.update_layout(height=500, width=1100, title_text="Histograms for all body joint angles")
fig.show()
In [16]:
df_melted = df.melt(id_vars='Label', var_name='body joint angle', value_name='Values')
colors = ['#288f11', '#00b79d', '#7fd3e8', '#00a3ff', '#8c3aff']
df_melted['body joint angle'] = df_melted['body joint angle'].str.replace('_', ' ')
fig = px.box(df_melted, x='Label', y='Values', color='Label',
facet_col='body joint angle', facet_col_wrap=2,
color_discrete_sequence=colors)
fig.update_layout(height=1000, width=1200, title="Box Plot for body joint angle by labels")
fig.show()
In [17]:
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])
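LabelEncoder assigns integer codes in sorted order of the class names; keeping that mapping visible helps when reading the confusion matrix later (a minimal sketch; the actual class names depend on the dataset):

print(dict(zip(label_encoder.classes_, range(len(label_encoder.classes_)))))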
In [20]:
def preprocess_data(df, label_column):
    X = df.drop(columns=[label_column])
    y = df[label_column]
    # The first five feature columns are joint angles (0-180 range);
    # the next five are ground angles (mostly saturated at +/-90)
    joint_angle_columns = X.columns[:5]
    ground_angle_columns = X.columns[5:10]
    transformers = [
        ('standard_scaler', StandardScaler(), joint_angle_columns),
        ('minmax_scaler', MinMaxScaler(), ground_angle_columns),
    ]
    column_transformer = ColumnTransformer(transformers)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Fit the scalers on the training split only, so no test-set statistics leak into preprocessing
    X_train_scaled = column_transformer.fit_transform(X_train)
    X_test_scaled = column_transformer.transform(X_test)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=np.concatenate([joint_angle_columns, ground_angle_columns]))
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=np.concatenate([joint_angle_columns, ground_angle_columns]))
    return X_train_scaled, y_train, X_test_scaled, y_test
In [21]:
X_train, y_train, X_test, y_test = preprocess_data(df, 'Label')
In [22]:
def train_classifiers(X_train, y_train, X_test, y_test, random_state=42):
classifiers = {
'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
'Decision Tree': DecisionTreeClassifier(random_state=random_state),
'Random Forest': RandomForestClassifier(random_state=random_state),
'Support Vector Machine': SVC(random_state=random_state),
'K-Nearest Neighbors': KNeighborsClassifier(),
'Naive Bayes': GaussianNB(),
'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
'Perceptron': Perceptron(random_state=random_state),
'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=random_state),
        'LightGBM': LGBMClassifier(random_state=random_state, verbosity=-1),
'CatBoost': CatBoostClassifier(silent=True, random_state=random_state)
}
results = []
for name, clf in classifiers.items():
print(f"Training {name}...")
clf.fit(X_train, y_train)
train_acc = accuracy_score(y_train, clf.predict(X_train))
test_acc = accuracy_score(y_test, clf.predict(X_test))
results.append({'Model': name, 'Train Accuracy': train_acc, 'Test Accuracy': test_acc})
    results_df = pd.DataFrame(results).sort_values('Test Accuracy', ascending=False).reset_index(drop=True)
return results_df
In [23]:
results_df = train_classifiers(X_train, y_train, X_test, y_test)
Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training Support Vector Machine...
Training K-Nearest Neighbors...
Training Naive Bayes...
Training Gradient Boosting...
Training Perceptron...
Training Quadratic Discriminant Analysis...
Training XGBoost...
Training LightGBM...
Training CatBoost...
In [24]:
results_df
Out[24]:
| | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | LightGBM | 0.992427 | 0.968745 |
| 1 | Random Forest | 1.000000 | 0.967617 |
| 2 | XGBoost | 0.998187 | 0.966812 |
| 3 | K-Nearest Neighbors | 0.973294 | 0.966006 |
| 4 | CatBoost | 0.982438 | 0.965684 |
| 5 | Decision Tree | 1.000000 | 0.938940 |
| 6 | Gradient Boosting | 0.930718 | 0.920735 |
| 7 | Support Vector Machine | 0.889914 | 0.891091 |
| 8 | Logistic Regression | 0.693547 | 0.697761 |
| 9 | Perceptron | 0.555426 | 0.549863 |
| 10 | Naive Bayes | 0.479981 | 0.483809 |
| 11 | Quadratic Discriminant Analysis | 0.168372 | 0.165781 |
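Accuracies from a single 80/20 split can shift a little with the random seed. A hedged sketch cross-validating the leading model on the preprocessed training data (5 folds is an arbitrary choice):

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LGBMClassifier(random_state=42, verbosity=-1), X_train, y_train, cv=5, scoring='accuracy')
print(f"CV accuracy: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")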
In [27]:
def optimize_lightgbm(X_train_scaled, y_train, X_test_scaled, y_test):
    def objective(trial):
        param = {
            'objective': 'multiclass',
            'num_class': len(set(y_train)),
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
            'verbosity': -1,
            'num_leaves': trial.suggest_int('num_leaves', 70, 200),
            'max_depth': trial.suggest_int('max_depth', 6, 20),
            'learning_rate': trial.suggest_float('learning_rate', 0.1, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 20, 400),
            'subsample': trial.suggest_float('subsample', 0.2, 1.0),  # inert unless subsample_freq > 0 (LightGBM default is 0)
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 0.5, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 100, log=True),
        }
        model = LGBMClassifier(**param, random_state=42)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        return accuracy_score(y_test, y_pred)
    sampler = TPESampler(seed=42)  # fixed seed keeps the TPE search reproducible
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=100)
    return study
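One caveat: the objective above scores every trial on the test set, so the selected hyperparameters are indirectly tuned to it and the final test accuracy will read slightly optimistic. A hedged variant would carve a validation split out of the training data and keep the test set for the final check only (sketch, names illustrative):

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# in objective(): fit on (X_tr, y_tr), return accuracy_score(y_val, model.predict(X_val))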
In [28]:
study = optimize_lightgbm(X_train,y_train, X_test, y_test)
print("Best hyperparameters: ", study.best_params)
Best hyperparameters: {'boosting_type': 'gbdt', 'num_leaves': 155, 'max_depth': 14, 'learning_rate': 0.1652995026469861, 'n_estimators': 266, 'subsample': 0.636504703516293, 'colsample_bytree': 0.969280540948789, 'reg_alpha': 0.0025343248745577033, 'reg_lambda': 0.018495999829183322}
In [48]:
fig = make_subplots(rows=2, cols=1, subplot_titles=("Optimization History", "Parameter Importances"))
scatter_color = 'blue'
line_color = 'green'
fig1 = plot_optimization_history(study)
for trace in fig1.data:
if isinstance(trace, go.Scatter) and 'markers' in trace.mode:
trace.update(marker=dict(color=scatter_color))
elif isinstance(trace, go.Scatter) and 'lines' in trace.mode:
trace.update(line=dict(color=line_color))
fig.add_trace(trace, row=1, col=1)
fig2 = plot_param_importances(study)
bar_color = 'darkblue'
for trace in fig2.data:
if isinstance(trace, go.Bar):
trace.update(marker=dict(color=bar_color))
trace.showlegend = False
fig.add_trace(trace, row=2, col=1)
fig.update_layout(height=800, title_text="Optuna Study Results")
fig.show()
In [49]:
def plot_hyperparameter_contour(study, param1, param2):
    # Keep only completed trials so a failed trial (value=None) cannot break the arrays
    trials = [t for t in study.trials if t.value is not None]
    x = np.array([trial.params[param1] for trial in trials])
    y = np.array([trial.params[param2] for trial in trials])
    z = np.array([trial.value for trial in trials])
contour = go.Contour(
x=x,
y=y,
z=z,
colorscale='Blues',
colorbar=dict(title="Accuracy"),
contours=dict(
start=np.min(z),
end=np.max(z),
size=(np.max(z) - np.min(z)) / 20,
showlabels=False
),
name=f"{param1} vs {param2}"
)
return contour
def plot_all_hyperparameter_contours(study):
fig = make_subplots(rows=2, cols=2, subplot_titles=(
"num_leaves vs max_depth",
"n_estimators vs learning_rate",
"subsample vs colsample_bytree",
"reg_alpha vs reg_lambda"
))
hyperparams = [
('num_leaves', 'max_depth', 1, 1),
('n_estimators', 'learning_rate', 1, 2),
('subsample', 'colsample_bytree', 2, 1),
('reg_alpha', 'reg_lambda', 2, 2)
]
for param1, param2, row, col in hyperparams:
        contour = plot_hyperparameter_contour(study, param1, param2)
fig.add_trace(contour, row=row, col=col)
fig.update_layout(title="Hyperparameter Contour Plots", height=800, width=1000)
fig.show()
In [50]:
plot_all_hyperparameter_contours(study)
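Optuna's built-in plot_contour (imported earlier but unused above) produces a comparable view with far less code, e.g.:

plot_contour(study, params=['num_leaves', 'max_depth']).show()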
In [32]:
best_params = study.best_params
print(f"Re-training LightGBM...")
model = LGBMClassifier(**best_params, random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)
print(f"Training Accuracy: {train_accuracy:.5f}")
print(f"Test Accuracy: {test_accuracy:.5f}")
Re-training LightGBM...
Training Accuracy: 1.00000
Test Accuracy: 0.97648
In [36]:
cm_df = pd.DataFrame(confusion_matrix_test,
index=label_encoder.inverse_transform(range(confusion_matrix_test.shape[0])),
columns=label_encoder.inverse_transform(range(confusion_matrix_test.shape[1])))
fig = ff.create_annotated_heatmap(
z=cm_df.values,
x=cm_df.columns.tolist(),
y=cm_df.index.tolist(),
colorscale='Blues',
)
fig.update_layout(
title='Confusion Matrix Heatmap',
xaxis_title='Predicted Labels',
yaxis_title='True Labels',
)
fig.show()
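Beyond overall accuracy, per-class precision and recall reveal whether any single exercise is misclassified disproportionately (a minimal sketch reusing the fitted model and encoder from above):

from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))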
In [51]:
feature_importances = model.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({
'Feature': features,
'Importance': feature_importances
})
importance_df['Normalized Importance'] = importance_df['Importance'] / importance_df['Importance'].sum()
importance_df = importance_df.sort_values(by='Normalized Importance', ascending=True)
fig = go.Figure()
fig.add_trace(go.Bar(
x=importance_df['Normalized Importance'],
y=importance_df['Feature'],
orientation='h',
marker=dict(color='darkblue')
))
fig.update_layout(
title='Normalized Feature Importance',
xaxis_title='Normalized Importance Score',
yaxis_title='Features',
showlegend=False
)
fig.show()
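Note that feature_importances_ on LGBMClassifier defaults to split counts (importance_type='split'); gain-based importance often ranks features differently and can be read from the underlying booster (a hedged sketch):

gain = model.booster_.feature_importance(importance_type='gain')
print(pd.Series(gain, index=X_train.columns).sort_values(ascending=False))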